1 Introduction

This analysis looks at the field data collected for the following 2017 MLRA project: - MLRA 111B - Glynwood B-slope Erosion; Northeastern IN

Spatially disaggregate the existing SSURGO polygons for Glynwood B-slope map units using ArcSIE, in order to separate different soil erosion phases. The current SSURGO maps join issues at the SSA boundaries, due to different erosion phases. This project is deemed relevant due to the current interest in Soil Health. Distinguishing the difference in erosion phases may have minimal impact on the majority of soil interpretations, but is believed to be significant in distinguishing crop yields.

2 Data Preparation

library(aqp)
library(soilDB)

library(reshape2)
library(ggplot2)
library(gridExtra)
library(knitr)

library(cluster)
library(caret)
library(party)
library(vegan)

library(rgdal)
library(sp)
library(mapview)

2.1 Point Data

# projectmapunit data from NASIS
project <- get_projectmapunit_from_NASIS()
project_nodups <- project[!duplicated(project$nationalmusym), c("nationalmusym", "muname")]

# MUPOLYGONs for the Project
gw_pol <- readOGR(dsn = paste0(ownCloud, "glynwood.shp"), layer = "glynwood")

# Soil Survey Areas
ssa <- readOGR(dsn = paste0(geodata, "soils/soilsa_a_nrcs.shp"), layer = "soilsa_a_nrcs")

# Series Extent of Glynwood from SoilWeb
gw_series <- seriesExtent("Glynwood")

# field Data
gw <- read.csv(paste0(ownCloud, "Pts_gnbero_27Jan17.csv"))

gw_sp <- gw
coordinates(gw_sp) <- ~ long + lat
proj4string(gw_sp) <- CRS("+init=epsg:4326")
gw_sp <- spTransform(gw_sp, CRS(proj4string(gw_pol)))

# spatial overlay field data with mupolygons and merge with nasis mapunits
vars <- c("AREASYMBOL", "nationalmu")
gw <- cbind(gw, over(gw_sp, gw_pol)[vars])
gw <- merge(gw, project_nodups, by.x = "nationalmu", by.y = "nationalmusym", all.x = TRUE, sort = FALSE)


# Extract erosion phases from NASIS and combine component and phase
ero_labels <- c("undisturbed", "slight", "moderate", "severe")

gw <- within(gw, {
  EroClassFD  = factor(EroClassFD, levels = 0:3, labels = ero_labels)
  EroClassSIE = factor(EroClassSIE, levels = 0:3, labels = ero_labels)
  EroClassFD2 = ifelse(EroClassFD == "severe", "severe", "slight")

  EroClassNASIS = NA
  EroClassNASIS[grepl("eroded", muname)] = "eroded"
  EroClassNASIS[grepl("sev.|severely", muname)] = "sev.eroded"
  EroClassNASIS[!grepl("eroded", muname)] = "non.eroded"
  
  soilname2 = soilname
  soilname2 = ifelse(soilname2 %in% c("Glynwood", "Morley", "Shinrock", "Rawson", "Mississinewa"), "Glynwood", soilname2)
  soilname2 = ifelse(soilname2 %in% c("Blount", "Elliott"), "Blount", soilname2)
  soilname2 = ifelse(soilname2 %in% c("Pewamo", "Pandora", "Mermill"), "Pewamo", soilname2)
  soilname3 = paste0(soilname2, ifelse(soilname2 == "Glynwood", paste0("-", EroClassFD2), ""))
  })
gw <- transform(gw,
                rgb = munsell2rgb(mxhue, mxvalue, mxchroma, return_triplets = TRUE)
                )
gw_sp <- gw
coordinates(gw_sp) <- ~ long + lat
proj4string(gw_sp) <- CRS("+init=epsg:4326")
gw_sp <- spTransform(gw_sp, CRS(proj4string(gw_pol)))

2.2 Spatial Data

The geodata from the Glynwood points was extracted from several rasters at various resolutions. The data using to generate the ArcSIE model came from a DEM with a resolution of 15-feet. The other used came from the 10-meter USGS NED, which was primarily resampled from LiDAR.

# Extract data from rasters
library(raster)

# NW files
fd <- paste0(geodata, "project_data/11FIN/PointDataEval/")
dd <- c("slope10",
        "procrv10",
        "plncrv10",
        "maxcrv10",
        "mincrv10",
        "relpos_r5",
        "wetness_mp"
        )
fp <- paste0(fd, "Mosaic_NW_pts/Derivatives/", dd, "/", "w001001.adf")
rs <- stack(fp)
names(rs) <- dd
proj4string(rs) <- CRS("+init=epsg:2965")
gd_nw <- extract(rs, gw_sp, df = TRUE, sp = TRUE)@data
gd_nw <- subset(gd_nw, !is.na(slope10))

# SE files
fp <- paste0(fd, "Mosaic_SE_pts/Derivatives/", dd, "/", "w001001.adf")
rs <- stack(fp)
names(rs) <- dd
proj4string(rs) <- CRS("+init=epsg:2965")
gd_se <- extract(rs, gw_sp, df = TRUE, sp = TRUE)@data
gd_se <- subset(gd_se, !is.na(slope10))

gd15ft <- rbind(gd_nw, gd_se)
rm(gd_nw, gd_se)
write.csv(gd15ft, file = paste0(ownCloud, "geodata_15ft_extract.csv"))


# Region 11 files

subset_rasters <- function(input, output) {
  cat(paste0(input, "\n"))
  gdal_translate(
    src_dataset = input,
    dst_dataset = output,
    projwin = c(bb[1,1], bb[2,2], bb[1,2], bb[2,1]),
    of = "GTiff",
    a_nodata = -99999,
    overwrite = TRUE,
    verbose = TRUE
    )
  }

warp_rasters <- function(input, output){
  cat(paste0(input,"\n"))
  gdalwarp(
    srcfile = input,
    dstfile = output,
    te = bb,
    s_srs = CRSargs(CRS("+init=epsg:5070")),
    t_srs = CRSargs(CRS("+init=epsg:5070")),
    r = "bilinear",
    tr = c(10, 10),
    of = "GTiff",
    overwrite = TRUE,
    verbose = TRUE 
    )
  }

fd <- paste0(geodata, "project_data/11FIN/sdat/")
dd2 <- c("ned10m_11FIN.sdat",
        "ned10m_11FIN_aspect5.sdat",
        "ned10m_11FIN_slope5.sdat",
        "ned10m_11FIN_cupro5.sdat",
        "ned10m_11FIN_cutan5.sdat",
        "ned30m_11FIN_mvalleys.sdat",
        "ned30m_11FIN_wetness.sdat",
        "ned30m_11FIN_z2stream.sdat",
        "nlcd30m_11FIN_lulc2011.sdat"
        )
dd_names <- c("elev", "aspect5", "slope5", "kp5", "kt5", "mvalley", "wetness2", "z2streams", "lulc")
dd <- paste0(fd, dd2)

input <- dd
output <- paste0(geodata, "project_data/11FIN/glynwood/", gsub(".sdat", ".tif", dd2))
bb <- bbox(gw_sp <- spTransform(gw_sp, CRS("+init=epsg:5070")))

mapply(subset_rasters, input, output)
mapply(subset_rasters, input = paste0(geodata, "project_data/11FIN/nlcd30m_11FIN_lulc2011.tif"), output = paste0(geodata, "project_data/11FIN/glynwood/nlcd30m_11FIN_lulc2011.tif"))

mapply(warp_rasters, input = output, output = gsub(".tif", "2.tif", gsub("30m", "10m", output)))

dd <- output
 
rs10m <- stack(dd[grepl("10m", dd)])
names(rs10m) <- dd_names[1:5]
rs30m <- stack(dd[grepl("30m", dd)])
names(rs30m) <- dd_names[6:9]

rs10m <- stack(gsub(".tif", "2.tif", gsub("30m", "10m", output)))
names(rs10m) <- dd_names

gw <- as.data.frame(extract(rs10m, gw_sp, df = TRUE, sp = TRUE))
gd30m <- as.data.frame(extract(rs30m, gw_sp, df = TRUE))
gw <- cbind(gd10m, gd30m[, -1])
rm(gd10m, gd30m)


# Save data
save(gw, gw_sp, gw_pol, gw_series, ssa, ero_labels, file = paste0(ownCloud, "Pts_gnbero_27Jan17_geodata.RData"))

3 Map Units vs Field Observations

# Load cached dataset
load(paste0(ownCloud, "Pts_gnbero_27Jan17_geodata.RData"))

ssa <- subset(ssa, areasymbol %in% unique(gw$AREASYMBOL))
ssa <- spTransform(ssa, CRS("+init=epsg:4326"))

# Create interactive map
mapView(gw_series) + ssa + gw_sp
vals2 <- c("EroClassFD", "EroClassNASIS", "nationalmu", "AREASYMBOL")
gw_sub <- gw[vals2]

# Frequency of field observation vs map unit
# Duplicate the data for each REASYMBOL and relabel MLRA
gw_sub2 <- by(gw_sub, gw_sub$nationalmu, function(x) {
  x[vals2][1, ]
  x[, "AREASYMBOL"] <- "MLRA"
  return(x)
  })
gw_sub2 <- do.call("rbind", gw_sub2)
gw_sub <- rbind(gw_sub, gw_sub2)
gw_sub$natmuSsaEro <-  with(gw_sub,
                            paste0(nationalmu, "-", AREASYMBOL, "-", EroClassNASIS)
                            )
test <- xtabs(~ natmuSsaEro + EroClassFD, data = gw_sub)
kable(test, caption = "Frequence by MUSYM-SSA-EROSION")
Frequence by MUSYM-SSA-EROSION
undisturbed slight moderate severe
2t6ll-IN009-sev.eroded 6 19 18 3
2t6ll-IN053-sev.eroded 3 13 20 11
2t6ll-IN075-sev.eroded 6 8 4 5
2t6ll-MLRA-sev.eroded 15 40 42 19
2t6lm-IN009-sev.eroded 2 10 10 0
2t6lm-IN053-sev.eroded 1 6 8 12
2t6lm-IN075-sev.eroded 3 1 11 9
2t6lm-IN179-sev.eroded 5 12 0 13
2t6lm-MLRA-sev.eroded 11 29 29 34
2v4bn-IN069-eroded 4 3 6 4
2v4bn-MLRA-eroded 4 3 6 4
2v4bp-IN179-eroded 0 3 0 2
2v4bp-MLRA-eroded 0 3 0 2
5jjt-IN035-sev.eroded 1 1 2 9
5jjt-MLRA-sev.eroded 1 1 2 9
NA-NA-non.eroded 6 9 11 22
kable(round(prop.table(test, 1) * 100), caption = "Percent by MUSYM-SSA-EROSION")
Percent by MUSYM-SSA-EROSION
undisturbed slight moderate severe
2t6ll-IN009-sev.eroded 13 41 39 7
2t6ll-IN053-sev.eroded 6 28 43 23
2t6ll-IN075-sev.eroded 26 35 17 22
2t6ll-MLRA-sev.eroded 13 34 36 16
2t6lm-IN009-sev.eroded 9 45 45 0
2t6lm-IN053-sev.eroded 4 22 30 44
2t6lm-IN075-sev.eroded 12 4 46 38
2t6lm-IN179-sev.eroded 17 40 0 43
2t6lm-MLRA-sev.eroded 11 28 28 33
2v4bn-IN069-eroded 24 18 35 24
2v4bn-MLRA-eroded 24 18 35 24
2v4bp-IN179-eroded 0 60 0 40
2v4bp-MLRA-eroded 0 60 0 40
5jjt-IN035-sev.eroded 8 8 15 69
5jjt-MLRA-sev.eroded 8 8 15 69
NA-NA-non.eroded 12 19 23 46

Several of counties phased severely eroded, are not dominanted by field observations classified as severely eroded.

4 Accuracy Assessment of the ArcSIE Predictions

cm <- confusionMatrix(data = gw$EroClassSIE, reference = gw$EroClassFD)
print(cm)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed           0      0        0      0
##   slight                6      8       11      9
##   moderate             24     39       53     37
##   severe                7     37       25     44
## 
## Overall Statistics
##                                           
##                Accuracy : 0.35            
##                  95% CI : (0.2961, 0.4069)
##     No Information Rate : 0.3             
##     P-Value [Acc > NIR] : 0.03521         
##                                           
##                   Kappa : 0.0767          
##  Mcnemar's Test P-Value : 1.555e-13       
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                      0.0000       0.09524          0.5955
## Specificity                      1.0000       0.87963          0.5261
## Pos Pred Value                      NaN       0.23529          0.3464
## Neg Pred Value                   0.8767       0.71429          0.7551
## Prevalence                       0.1233       0.28000          0.2967
## Detection Rate                   0.0000       0.02667          0.1767
## Detection Prevalence             0.0000       0.11333          0.5100
## Balanced Accuracy                0.5000       0.48743          0.5608
##                      Class: severe
## Sensitivity                 0.4889
## Specificity                 0.6714
## Pos Pred Value              0.3894
## Neg Pred Value              0.7540
## Prevalence                  0.3000
## Detection Rate              0.1467
## Detection Prevalence        0.3767
## Balanced Accuracy           0.5802
test <- as.data.frame(cm$table)

ggplot(test, aes(x = Reference, y = Freq, fill = Prediction)) +
  geom_bar(stat = "identity") +
  coord_flip()

The accuracy of the current ArcSIE model appears to be low, according to several metrics. None of the classes achieve a positive predictive value of > 50% for the reference data.

4.1 Boxplots

soil_vals <- c("hzthk", "SolumDp", "CaCO3Dp", "claytotest", "firstbtclay", "mxvalue", "mxchroma")
geo_vals1 <- c("SlopeSIE", "ProfCrv", "PlanCrv", "relpos", "wetness")
geo_vals2 <- c("slope5", "kt5", "kp5", "z2streams", "wetness2", "mvalley")

vals <- c(soil_vals, geo_vals1, geo_vals2)

gw_lo1 <- melt(gw, id.vars = "EroClassFD", measure.vars = vals)
gw_lo2 <- melt(gw, id.vars = "EroClassSIE", measure.vars = vals)

names(gw_lo1)[1] <- "EroClass"
gw_lo1$method <- "FD"
names(gw_lo2)[1] <- "EroClass"
gw_lo2$method <- "SIE"
gw_lo <- rbind(gw_lo1, gw_lo2)

ggplot(gw_lo, aes(x = EroClass, y = value)) +
  geom_boxplot() +
  facet_wrap(~ paste(variable, method), scales="free", ncol = 4) +
  coord_flip()

An exploratory analysis shows a considerable amount of overlap exists between the field determined (FD) erosion classes and measurable soil properties. In comparison the FD and SIE (Soil Inference Engine) erosion classes show different patterns within the boxplots, further suggesting that the SIE classes aren’t capturing the field observations accurately.

4.2 Scatterplots

soil_vals2 <- c("hzthk", "SolumDp", "CaCO3Dp", "claytotest", "firstbtclay") # excluded color, only observed a narrow range thus small differences swamp everthing else
vals <- c(soil_vals2, geo_vals2)

test <- gw[, vals]
test_d <- daisy(scale(test), metric = "gower")
test_mds <- metaMDS(test_d, distance = "gower", autotransform = FALSE, trace = FALSE)
test_pts <- cbind(as.data.frame(test_mds$points), EroClassFD = gw$EroClassFD)

g1 <- ggplot(gw, aes(x = hzthk, y = SolumDp, color = EroClassFD)) +
  geom_point(cex = 2, alpha = 0.75) +
  theme(aspect.ratio = 1)
g2 <- ggplot(test_pts, aes(x = MDS1, y = MDS2, color = EroClassFD)) +
  geom_point(cex = 2, alpha = 0.75) +
  theme(aspect.ratio = 1)
grid.arrange(g1, g2, ncol = 2)

According to the scatterplot above it appears that only the severe and slight classes are separatable. The moderate erosion class seems to overlap the most with slight. Both the 15-feet and 10-meter DEM derivatives were evaluated, but the results are similar. The overlap in the FD classes is likely due to bias amongst the soil scientists who collected the data.

4.3 Classification Trees

test <- subset(gw, !is.na(EroClassFD))

test_ct <- ctree(EroClassFD ~ ., data = test[, c("EroClassFD", soil_vals)])
plot(test_ct)

confusionMatrix(data = predict(test_ct, type = "response"), reference = test$EroClassFD)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed          30     20       11      2
##   slight                3     48        5      1
##   moderate              3     17       67     27
##   severe                1      0        7     60
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6788          
##                  95% CI : (0.6229, 0.7311)
##     No Information Rate : 0.298           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.567           
##  Mcnemar's Test P-Value : 1.943e-06       
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                     0.81081        0.5647          0.7444
## Specificity                     0.87547        0.9585          0.7783
## Pos Pred Value                  0.47619        0.8421          0.5877
## Neg Pred Value                  0.97071        0.8490          0.8777
## Prevalence                      0.12252        0.2815          0.2980
## Detection Rate                  0.09934        0.1589          0.2219
## Detection Prevalence            0.20861        0.1887          0.3775
## Balanced Accuracy               0.84314        0.7616          0.7614
##                      Class: severe
## Sensitivity                 0.6667
## Specificity                 0.9623
## Pos Pred Value              0.8824
## Neg Pred Value              0.8718
## Prevalence                  0.2980
## Detection Rate              0.1987
## Detection Prevalence        0.2252
## Balanced Accuracy           0.8145

5 Hierachical Clusters

In order to see if more separation can be achieved amongst the erosion classes a hierachical classifition was peformed. Four unsupervised classes were chosen and manually matched to the FD classes.

test_c <- hclust(test_d, method = "ward")
plot(test_c, labels = gw$upedonid)
rect.hclust(test_c, k = 4)

clusters <- cbind(gw, 
                  test_pts[, 1:2], 
                  clusters = factor(cutree(test_c, k = 4),
                                    levels = c(2, 3, 1, 4),
                                    labels = ero_labels
                                    )
                  )

xtabs(~ EroClassFD + clusters, data = clusters)
##              clusters
## EroClassFD    undisturbed slight moderate severe
##   undisturbed          14     14        6      3
##   slight               12     48       22      3
##   moderate              7     24       25     34
##   severe                4      0       17     69

5.1 Scatter Plots

g1 <- ggplot(clusters, aes(x = MDS1, y = MDS2, col = EroClassFD)) +
  geom_point(cex = 2, alpha = 0.75) +
  theme(aspect.ratio = 1)
g2 <- ggplot(clusters, aes(x = MDS1, y = MDS2, col = clusters), main = "test") +
  geom_point(cex = 2, alpha = 0.75) +
  theme(aspect.ratio = 1)
grid.arrange(g1, g2, ncol = 2)

In comparison the hierarchical clusters have less overlap when viewed along the multidimensional scaled axes, but still does not seem to separate the moderate class.

5.2 Box Plots

gw_lo3 <- melt(clusters, id.vars = "clusters", measure.vars = c(soil_vals, geo_vals2))

names(gw_lo3)[1] <- "EroClass"
gw_lo3$method <- "clusters"
gw_lo1 <- subset(gw_lo1, ! variable %in%  c("relpos", "wetness", "SlopeSIE"))
gw_lo <- rbind(gw_lo1, gw_lo3)

ggplot(gw_lo, aes(x = EroClass, y = value)) +
  geom_boxplot() +
  facet_wrap(~ paste(variable, method), scales="free", ncol = 4) + 
  coord_flip()

5.3 Classification Tree

test2 <- ctree(clusters ~ ., data = clusters[, c("clusters", soil_vals)])
plot(test2)

confusionMatrix(data = predict(test2, type = "response"), reference = clusters$clusters)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed          36     14        1      0
##   slight                1     52       10      2
##   moderate              0     19       48     23
##   severe                0      1       11     84
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7285          
##                  95% CI : (0.6746, 0.7778)
##     No Information Rate : 0.3609          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6302          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                      0.9730        0.6047          0.6857
## Specificity                      0.9434        0.9398          0.8190
## Pos Pred Value                   0.7059        0.8000          0.5333
## Neg Pred Value                   0.9960        0.8565          0.8962
## Prevalence                       0.1225        0.2848          0.2318
## Detection Rate                   0.1192        0.1722          0.1589
## Detection Prevalence             0.1689        0.2152          0.2980
## Balanced Accuracy                0.9582        0.7722          0.7523
##                      Class: severe
## Sensitivity                 0.7706
## Specificity                 0.9378
## Pos Pred Value              0.8750
## Neg Pred Value              0.8786
## Prevalence                  0.3609
## Detection Rate              0.2781
## Detection Prevalence        0.3179
## Balanced Accuracy           0.8542

6 Statistical Modeling

Below several statistical models are evaluated to see if a more accurate model can be developed.

6.1 FD Classes vs DEM Derivatives

test3 <- ctree(EroClassFD ~ ., data = clusters[, c("EroClassFD", geo_vals2)])
plot(test3)

confusionMatrix(data = predict(test3, type = "response"), reference = clusters$EroClassFD)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed           0      0        0      0
##   slight                0      0        0      0
##   moderate             18     12       21     13
##   severe               19     73       69     77
## 
## Overall Statistics
##                                          
##                Accuracy : 0.3245         
##                  95% CI : (0.272, 0.3805)
##     No Information Rate : 0.298          
##     P-Value [Acc > NIR] : 0.1724         
##                                          
##                   Kappa : 0.0377         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                      0.0000        0.0000         0.23333
## Specificity                      1.0000        1.0000         0.79717
## Pos Pred Value                      NaN           NaN         0.32812
## Neg Pred Value                   0.8775        0.7185         0.71008
## Prevalence                       0.1225        0.2815         0.29801
## Detection Rate                   0.0000        0.0000         0.06954
## Detection Prevalence             0.0000        0.0000         0.21192
## Balanced Accuracy                0.5000        0.5000         0.51525
##                      Class: severe
## Sensitivity                 0.8556
## Specificity                 0.2406
## Pos Pred Value              0.3235
## Neg Pred Value              0.7969
## Prevalence                  0.2980
## Detection Rate              0.2550
## Detection Prevalence        0.7881
## Balanced Accuracy           0.5481
test3 <- cforest(as.factor(EroClassFD) ~ ., data = gw[, c("EroClassFD", geo_vals2)])
varimp(test3)
##       slope5          kt5          kp5    z2streams     wetness2 
##  0.032558559  0.014810811  0.006414414  0.004432432  0.016270270 
##      mvalley 
## -0.001873874
confusionMatrix(data = predict(test3, type = "response", OOB = TRUE), reference = gw$EroClassFD)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed           3      8        4      4
##   slight                8     27       24     17
##   moderate             15     25       34     15
##   severe               11     25       28     54
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3907          
##                  95% CI : (0.3354, 0.4483)
##     No Information Rate : 0.298           
##     P-Value [Acc > NIR] : 0.0003594       
##                                           
##                   Kappa : 0.1505          
##  Mcnemar's Test P-Value : 0.0194217       
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                    0.081081        0.3176          0.3778
## Specificity                    0.939623        0.7742          0.7406
## Pos Pred Value                 0.157895        0.3553          0.3820
## Neg Pred Value                 0.879859        0.7434          0.7371
## Prevalence                     0.122517        0.2815          0.2980
## Detection Rate                 0.009934        0.0894          0.1126
## Detection Prevalence           0.062914        0.2517          0.2947
## Balanced Accuracy              0.510352        0.5459          0.5592
##                      Class: severe
## Sensitivity                 0.6000
## Specificity                 0.6981
## Pos Pred Value              0.4576
## Neg Pred Value              0.8043
## Prevalence                  0.2980
## Detection Rate              0.1788
## Detection Prevalence        0.3907
## Balanced Accuracy           0.6491

6.2 Clusters vs DEM Derivatives

test4 <- ctree(clusters ~ ., data = clusters[, c("clusters", geo_vals2)])
plot(test4)

confusionMatrix(data = predict(test4, type = "response"), reference = clusters$clusters)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed          12      3        1      2
##   slight                0     31        1     14
##   moderate             13      5       63     12
##   severe               12     47        5     81
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6192          
##                  95% CI : (0.5618, 0.6742)
##     No Information Rate : 0.3609          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4596          
##  Mcnemar's Test P-Value : 7.988e-08       
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                     0.32432        0.3605          0.9000
## Specificity                     0.97736        0.9306          0.8707
## Pos Pred Value                  0.66667        0.6739          0.6774
## Neg Pred Value                  0.91197        0.7852          0.9665
## Prevalence                      0.12252        0.2848          0.2318
## Detection Rate                  0.03974        0.1026          0.2086
## Detection Prevalence            0.05960        0.1523          0.3079
## Balanced Accuracy               0.65084        0.6455          0.8853
##                      Class: severe
## Sensitivity                 0.7431
## Specificity                 0.6684
## Pos Pred Value              0.5586
## Neg Pred Value              0.8217
## Prevalence                  0.3609
## Detection Rate              0.2682
## Detection Prevalence        0.4801
## Balanced Accuracy           0.7058
test4 <- cforest(clusters ~ ., data = clusters[, c("clusters", geo_vals2)])
varimp(test4)
##      slope5         kt5         kp5   z2streams    wetness2     mvalley 
## 0.016828829 0.022774775 0.001027027 0.001135135 0.009315315 0.196288288
confusionMatrix(data = predict(test4, type = "response", OOB=TRUE), reference = clusters$clusters)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed           9      1        3      4
##   slight                4     40        1     25
##   moderate             13      4       61     11
##   severe               11     41        5     69
## 
## Overall Statistics
##                                          
##                Accuracy : 0.5927         
##                  95% CI : (0.535, 0.6486)
##     No Information Rate : 0.3609         
##     P-Value [Acc > NIR] : 2.413e-16      
##                                          
##                   Kappa : 0.4249         
##  Mcnemar's Test P-Value : 0.003769       
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                     0.24324        0.4651          0.8714
## Specificity                     0.96981        0.8611          0.8793
## Pos Pred Value                  0.52941        0.5714          0.6854
## Neg Pred Value                  0.90175        0.8017          0.9577
## Prevalence                      0.12252        0.2848          0.2318
## Detection Rate                  0.02980        0.1325          0.2020
## Detection Prevalence            0.05629        0.2318          0.2947
## Balanced Accuracy               0.60653        0.6631          0.8754
##                      Class: severe
## Sensitivity                 0.6330
## Specificity                 0.7047
## Pos Pred Value              0.5476
## Neg Pred Value              0.7727
## Prevalence                  0.3609
## Detection Rate              0.2285
## Detection Prevalence        0.4172
## Balanced Accuracy           0.6688

6.3 Soil series and phases

Try and model the soil components and phases separately. The data is highly unbalanced. Create separate logistic regression models for similar minor components.

#gw$weights <- ifelse(gw$soilname == "Glynwood", 0.1, 1)
gw$gw_severe <- ifelse(gw$soilname3 == "Glynwood-severe", TRUE, FALSE)

test4 <- cforest(as.factor(gw_severe) ~ elev + slope5 + kt5 + kp5 + wetness2 + mvalley + z2streams, data = gw)
sort(varimp(test4), decreasing = TRUE)
##        slope5          elev      wetness2           kt5     z2streams 
##  0.0274774775  0.0251171171  0.0157297297  0.0148468468  0.0022162162 
##           kp5       mvalley 
##  0.0009009009 -0.0003063063
confusionMatrix(data = predict(test4, type = "response", OOB = TRUE), reference = gw$gw_severe, positive = "TRUE")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   197   65
##      TRUE     15   25
##                                          
##                Accuracy : 0.7351         
##                  95% CI : (0.6815, 0.784)
##     No Information Rate : 0.702          
##     P-Value [Acc > NIR] : 0.1152         
##                                          
##                   Kappa : 0.2464         
##  Mcnemar's Test P-Value : 4.293e-08      
##                                          
##             Sensitivity : 0.27778        
##             Specificity : 0.92925        
##          Pos Pred Value : 0.62500        
##          Neg Pred Value : 0.75191        
##              Prevalence : 0.29801        
##          Detection Rate : 0.08278        
##    Detection Prevalence : 0.13245        
##       Balanced Accuracy : 0.60351        
##                                          
##        'Positive' Class : TRUE           
## 
test3 <- glm(as.factor(gw_severe) ~ elev + slope5 + kt5, data = gw, family = "binomial", na.action = na.exclude)
confusionMatrix(data = predict(test3, type = "response") > 0.4, reference = gw$gw_severe, positive = "TRUE")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   174   49
##      TRUE     35   40
##                                           
##                Accuracy : 0.7181          
##                  95% CI : (0.6634, 0.7685)
##     No Information Rate : 0.7013          
##     P-Value [Acc > NIR] : 0.2865          
##                                           
##                   Kappa : 0.2953          
##  Mcnemar's Test P-Value : 0.1561          
##                                           
##             Sensitivity : 0.4494          
##             Specificity : 0.8325          
##          Pos Pred Value : 0.5333          
##          Neg Pred Value : 0.7803          
##              Prevalence : 0.2987          
##          Detection Rate : 0.1342          
##    Detection Prevalence : 0.2517          
##       Balanced Accuracy : 0.6410          
##                                           
##        'Positive' Class : TRUE            
## 
summary(test3)
## 
## Call:
## glm(formula = as.factor(gw_severe) ~ elev + slope5 + kt5, family = "binomial", 
##     data = gw, na.action = na.exclude)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5512  -0.8366  -0.6028   1.0314   2.4255  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.953655   2.044889  -4.379 1.19e-05 ***
## elev         0.024425   0.007071   3.455 0.000551 ***
## slope5       0.388811   0.109621   3.547 0.000390 ***
## kt5          0.089968   0.027948   3.219 0.001286 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 363.39  on 297  degrees of freedom
## Residual deviance: 327.73  on 294  degrees of freedom
##   (4 observations deleted due to missingness)
## AIC: 335.73
## 
## Number of Fisher Scoring iterations: 4
gw$predicted <- predict(test3, type = "response") > 0.4
gw_lo1 <- melt(gw, id.vars = "gw_severe", measure.vars = vals)
gw_lo2 <- melt(gw, id.vars = "predicted", measure.vars = vals)
gw_lo2 <- na.exclude(gw_lo2)

names(gw_lo1)[1] <- "EroClass"
gw_lo1$method <- "FD"
names(gw_lo2)[1] <- "EroClass"
gw_lo2$method <- "GLM"
gw_lo <- rbind(gw_lo1, gw_lo2)

ggplot(gw_lo, aes(x = EroClass, y = value)) +
  geom_boxplot() +
  facet_wrap(~ paste(variable, method), scales="free", ncol = 4) +
  coord_flip()

predfun <- function(model, data) {
  v <- predict(model, data, type = "response")
  cbind(
    p = as.vector(v$fit)
    )
}
r <- predict(rs10m, test3, fun = predfun, index = 1:2, progress = "text")
writeRaster(r[[1]], "C:/workspace/severe_erosion.tif", overwrite = TRUE, progress = "text")

r <-predict(rs10m, test4, type='response', progress='text')
writeRaster(r[[1]], "C:/workspace/severe_erosion_cf.tif", overwrite = TRUE, progress = "text")

7 Summary

7.1 Issues

  • The field data does not confirm the map units phased severely eroded.
  • The previous analysis did not exclude the minor components.
  • An exploratory analysis illustrated a considerable amount of overlap exists between the field determined (FD) erosion classes and measurable soil properties. However, the FD erosion classes were similarly predictive as the results of a cluster analysis. The classes derived by cluster analysis appeared to overlap slightly and moderately eroded, and were best separated by depth to CaCO3, while the FD erosion classes were best split on A horizon thickness.
  • The accuracy of the current ArcSIE model appears to be low, according to several metrics.
  • A preliminary analysis found a random forest model to be 20% more accurate than the ArcSIE model.
  • Re-delineating the SSURGO map units will most likely result in numerous small delineations.

7.2 Recommendations

  • If disaggregating the SSURGO polygons is deemed impractical or inaccurate, re-label the Glynwood B-slope map units (e.g. severely eroded) to the appropriate map unit concept (e.g moderately eroded).
  • Evaluate ways to improve the accuracy of the ArcSIE model, or test alternative models such as random forests.
  • Conduct a field review May 15th (?) to evaluate the erosion phase concepts and spatial model.
  • Evaluate the effect of lumping erosion classes (e.g. slightly and moderately).
  • Access yield data from a producer in order to validate the hypothesis that the erosion phases help explain yield variability.